{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Machine Learning models as APIs using Flask\n",
    "\n",
    "### 1. Python Environment Setup & Flask Basics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Creating a Machine Learning Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/zhaoyadong/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
      "  warnings.warn(msg, category=FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "import os \n",
    "import json\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.externals import joblib\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "from sklearn.pipeline import make_pipeline\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 数据集：训练集和测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test.csv     training.csv\n"
     ]
    }
   ],
   "source": [
    "!ls ../data/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv('../data/training.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Loan_ID',\n",
       " 'Gender',\n",
       " 'Married',\n",
       " 'Dependents',\n",
       " 'Education',\n",
       " 'Self_Employed',\n",
       " 'ApplicantIncome',\n",
       " 'CoapplicantIncome',\n",
       " 'LoanAmount',\n",
       " 'Loan_Amount_Term',\n",
       " 'Credit_History',\n",
       " 'Property_Area',\n",
       " 'Loan_Status']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(data.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(614, 13)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 找到列中的缺失值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of null values in:Loan_ID == 0\n",
      "The number of null values in:Gender == 13\n",
      "The number of null values in:Married == 3\n",
      "The number of null values in:Dependents == 15\n",
      "The number of null values in:Education == 0\n",
      "The number of null values in:Self_Employed == 32\n",
      "The number of null values in:ApplicantIncome == 0\n",
      "The number of null values in:CoapplicantIncome == 0\n",
      "The number of null values in:LoanAmount == 22\n",
      "The number of null values in:Loan_Amount_Term == 14\n",
      "The number of null values in:Credit_History == 50\n",
      "The number of null values in:Property_Area == 0\n",
      "The number of null values in:Loan_Status == 0\n"
     ]
    }
   ],
   "source": [
    "for _ in data.columns:\n",
    "    print(\"The number of null values in:{} == {}\".format(_, data[_].isnull().sum()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 创建 `training` 和 `testing` 数据集:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome',\\\n",
    "            'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(data[pred_var], \n",
    "                                                    data['Loan_Status'],\n",
    "                                                    test_size=0.25, \n",
    "                                                    random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 把处理过程写到一个类中`pre-processing` \n",
    "\n",
    "__custom pre-processing Scikit-learn `estimator`__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "\n",
    "class PreProcessing(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Custom Pre-Processing estimator for our use-case\n",
    "    \"\"\"\n",
    "    \n",
    "    def __init__(self):\n",
    "        pass\n",
    "\n",
    "    def transform(self, df):\n",
    "        pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome',\\\n",
    "                    'CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']\n",
    "        \n",
    "        df = df[pred_var]\n",
    "        \n",
    "        df['Dependents'] = df['Dependents'].fillna(0)\n",
    "        df['Self_Employed'] = df['Self_Employed'].fillna('No')\n",
    "        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(self.term_mean_)\n",
    "        df['Credit_History'] = df['Credit_History'].fillna(1)\n",
    "        df['Married'] = df['Married'].fillna('No')\n",
    "        df['Gender'] = df['Gender'].fillna('Male')\n",
    "        df['LoanAmount'] = df['LoanAmount'].fillna(self.amt_mean_)\n",
    "        \n",
    "        gender_values = {'Female' : 0, 'Male' : 1} \n",
    "        married_values = {'No' : 0, 'Yes' : 1}\n",
    "        education_values = {'Graduate' : 0, 'Not Graduate' : 1}\n",
    "        employed_values = {'No' : 0, 'Yes' : 1}\n",
    "        property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}\n",
    "        dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}\n",
    "        \n",
    "        df.replace(\n",
    "            {\n",
    "                'Gender': gender_values, \n",
    "                'Married': married_values, \n",
    "                'Education': education_values,\n",
    "                'Self_Employed': employed_values, \n",
    "                'Property_Area': property_values,\n",
    "                'Dependents': dependent_values           \n",
    "            }, inplace=True\n",
    "        )\n",
    "        \n",
    "        return df.as_matrix()\n",
    "\n",
    "    def fit(self, df, y=None, **fit_params):        \n",
    "        self.term_mean_ = df['Loan_Amount_Term'].mean()\n",
    "        self.amt_mean_ = df['LoanAmount'].mean()\n",
    "        return self"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 把`y_train` 和 `y_test` 转换后才 `np.array`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_train = y_train.replace({'Y':1, 'N':0}).as_matrix()\n",
    "y_test = y_test.replace({'Y':1, 'N':0}).as_matrix()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用管道去确保整个数据预处理流程做到一个`scikit-learn estimator`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pipe = make_pipeline(PreProcessing(),RandomForestClassifier())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "         steps=[('preprocessing', PreProcessing()),\n",
       "                ('randomforestclassifier',\n",
       "                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,\n",
       "                                        class_weight=None, criterion='gini',\n",
       "                                        max_depth=None, max_features='auto',\n",
       "                                        max_leaf_nodes=None, max_samples=None,\n",
       "                                        min_impurity_decrease=0.0,\n",
       "                                        min_impurity_split=None,\n",
       "                                        min_samples_leaf=1, min_samples_split=2,\n",
       "                                        min_weight_fraction_leaf=0.0,\n",
       "                                        n_estimators=100, n_jobs=None,\n",
       "                                        oob_score=False, random_state=None,\n",
       "                                        verbose=0, warm_start=False))],\n",
       "         verbose=False)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 使用网格搜索法选择参数\n",
    "\n",
    "使用`Grid Search`，搜索最佳`hyper-parameters` (`degree` for `PolynomialFeatures` & `alpha` for `Ridge`):\n",
    "\n",
    "- 定义`param_grid`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "param_grid = {\"randomforestclassifier__n_estimators\" : [10, 20, 30],\n",
    "             \"randomforestclassifier__max_depth\" : [None, 6, 8, 10],\n",
    "             \"randomforestclassifier__max_leaf_nodes\": [None, 5, 10, 20], \n",
    "             \"randomforestclassifier__min_impurity_split\": [0.1, 0.2, 0.3]}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 运行`Grid Search`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "grid = GridSearchCV(pipe, param_grid=param_grid, cv=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 训练模型 `pipeline estimator`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=3, error_score=nan,\n",
       "             estimator=Pipeline(memory=None,\n",
       "                                steps=[('preprocessing', PreProcessing()),\n",
       "                                       ('randomforestclassifier',\n",
       "                                        RandomForestClassifier(bootstrap=True,\n",
       "                                                               ccp_alpha=0.0,\n",
       "                                                               class_weight=None,\n",
       "                                                               criterion='gini',\n",
       "                                                               max_depth=None,\n",
       "                                                               max_features='auto',\n",
       "                                                               max_leaf_nodes=None,\n",
       "                                                               max_samples=None,\n",
       "                                                               min_impurity_decrease=0.0,\n",
       "                                                               min_impurity_split=None,\n",
       "                                                               min_sampl...\n",
       "                                verbose=False),\n",
       "             iid='deprecated', n_jobs=None,\n",
       "             param_grid={'randomforestclassifier__max_depth': [None, 6, 8, 10],\n",
       "                         'randomforestclassifier__max_leaf_nodes': [None, 5, 10,\n",
       "                                                                    20],\n",
       "                         'randomforestclassifier__min_impurity_split': [0.1,\n",
       "                                                                        0.2,\n",
       "                                                                        0.3],\n",
       "                         'randomforestclassifier__n_estimators': [10, 20, 30]},\n",
       "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
       "             scoring=None, verbose=0)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 查看Grid Search选择的最佳参数和分数:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters: {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__max_leaf_nodes': None, 'randomforestclassifier__min_impurity_split': 0.3, 'randomforestclassifier__n_estimators': 10}\n"
     ]
    }
   ],
   "source": [
    "print(\"Best parameters: {}\".format(grid.best_params_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Validation set score: 0.77\n"
     ]
    }
   ],
   "source": [
    "print(\"Validation set score: {:.2f}\".format(grid.score(X_test, y_test)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 加载测试集:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test_df = pd.read_csv('../data/test.csv', encoding=\"utf-8\")\n",
    "test_df = test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Loan_ID</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Married</th>\n",
       "      <th>Dependents</th>\n",
       "      <th>Education</th>\n",
       "      <th>Self_Employed</th>\n",
       "      <th>ApplicantIncome</th>\n",
       "      <th>CoapplicantIncome</th>\n",
       "      <th>LoanAmount</th>\n",
       "      <th>Loan_Amount_Term</th>\n",
       "      <th>Credit_History</th>\n",
       "      <th>Property_Area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LP001015</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>5720</td>\n",
       "      <td>0</td>\n",
       "      <td>110.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LP001022</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>1</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>3076</td>\n",
       "      <td>1500</td>\n",
       "      <td>126.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LP001031</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>2</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>5000</td>\n",
       "      <td>1800</td>\n",
       "      <td>208.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LP001035</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>2</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>2340</td>\n",
       "      <td>2546</td>\n",
       "      <td>100.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Urban</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>LP001051</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Not Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>3276</td>\n",
       "      <td>0</td>\n",
       "      <td>78.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Loan_ID Gender Married Dependents     Education Self_Employed  \\\n",
       "0  LP001015   Male     Yes          0      Graduate            No   \n",
       "1  LP001022   Male     Yes          1      Graduate            No   \n",
       "2  LP001031   Male     Yes          2      Graduate            No   \n",
       "3  LP001035   Male     Yes          2      Graduate            No   \n",
       "4  LP001051   Male      No          0  Not Graduate            No   \n",
       "\n",
       "   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \\\n",
       "0             5720                  0       110.0             360.0   \n",
       "1             3076               1500       126.0             360.0   \n",
       "2             5000               1800       208.0             360.0   \n",
       "3             2340               2546       100.0             360.0   \n",
       "4             3276                  0        78.0             360.0   \n",
       "\n",
       "   Credit_History Property_Area  \n",
       "0             1.0         Urban  \n",
       "1             1.0         Urban  \n",
       "2             1.0         Urban  \n",
       "3             NaN         Urban  \n",
       "4             1.0         Urban  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 1, 1, 1, 1])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid.predict(test_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### __Serialize the Machine Learning Model__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 保存机器学习模型：序列化和反序列化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "python 中一般是使用pickle模块来实现序列化和反序列化：\n",
    "\n",
    "- 序列化是指将一个对象转换为一个能够存储在一个文件中或者网络上进行传输的字节流的过程。\n",
    "- 反序列化指的是相反的过程，它是从字节流中提取对象的过程。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "list_to_pickle = [1, 'here', 123, 'walker']\n",
    "\n",
    "#Pickling the list\n",
    "import pickle\n",
    "\n",
    "# 序列化\n",
    "list_pickle = pickle.dumps(list_to_pickle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "b'\\x80\\x03]q\\x00(K\\x01X\\x04\\x00\\x00\\x00hereq\\x01K{X\\x06\\x00\\x00\\x00walkerq\\x02e.'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_pickle"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When we load the pickle back:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 反序列化\n",
    "loaded_pickle = pickle.loads(list_pickle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 'here', 123, 'walker']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loaded_pickle"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "NOTE：\n",
    "* 在我们实际部署机器学习模型的过程中，一般是把训练好的模型序列化到一个文件夹中(一般使用pickle和h5py)\n",
    "* dill将python用于序列化和反序列化python对象的pickle模块扩展到大多数内置python类型。比如嵌套函数类型的对象pickle不可以存储，但dill可以。dill提供和pickle相同的接口，使用时，“import dill as pickle”即可。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: http://mirrors.aliyun.com/pypi/simple/\n",
      "Requirement already satisfied: dill in /Users/zhaoyadong/anaconda3/lib/python3.6/site-packages (0.3.1.1)\n"
     ]
    }
   ],
   "source": [
    "!pip install dill"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import dill as pickle\n",
    "filename = 'model_v2.pk'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open('../flask_api/models/'+filename, 'wb') as file:\n",
    "    pickle.dump(grid, file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这样我们就将我们训练的最佳模型给序列化出来了`model_v2.pk`,我们在Flask使用前先测试下。。。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 1, 1, 1, 1])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('../flask_api/models/'+filename ,'rb') as f:\n",
    "    loaded_model = pickle.load(f)\n",
    "loaded_model.predict(test_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Creating an API using Flask"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "我们的文件目录结构如下图所示：\n",
    "\n",
    "![Folder Struct](images/flaskapp3.png)\n",
    "\n",
    "There are three important parts in constructing our wrapper function, **`apicall()`**:\n",
    "\n",
    "- Getting the **`request`** data (for which predictions are to be made)\n",
    "\n",
    "- Loading our **`pickled estimator`**\n",
    "\n",
    "- **`jsonify`** our predictions and send the response back with **`status code: 200`**\n",
    "\n",
    "HTTP messages are made of a header and a body. As a standard, majority of the body content sent across are in **`json`** format. We'll be sending (**`POST url-endpoint/`**) the incoming data as batch to get predictions.\n",
    "\n",
    "(__NOTE:__ You can send plain **text, XML, csv or image** directly but for the sake of interchangeability of the format, it is advisable to use **`json`**)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```python\n",
    "\"\"\"Filename: server.py\n",
    "\"\"\"\n",
    "\n",
    "import os\n",
    "import pandas as pd\n",
    "from sklearn.externals import joblib\n",
    "from flask import Flask, jsonify, request\n",
    "\n",
    "app = Flask(__name__)\n",
    "\n",
    "@app.route('/predict', methods=['POST'])\n",
    "def apicall():\n",
    "\t\"\"\"API Call\n",
    "\t\n",
    "\tPandas dataframe (sent as a payload) from API Call\n",
    "\t\"\"\"\n",
    "\ttry:\n",
    "\t\ttest_json = request.get_json()\n",
    "\t\ttest = pd.read_json(test_json, orient='records')\n",
    "\n",
    "\t\t#To resolve the issue of TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'str'\n",
    "\t\ttest['Dependents'] = [str(x) for x in list(test['Dependents'])]\n",
    "\n",
    "\t\t#Getting the Loan_IDs separated out\n",
    "\t\tloan_ids = test['Loan_ID']\n",
    "\n",
    "\texcept Exception as e:\n",
    "\t\traise e\n",
    "\t\n",
    "\tclf = 'model_v2.pk'\n",
    "\t\n",
    "\tif test.empty:\n",
    "\t\treturn(bad_request())\n",
    "\telse:\n",
    "\t\t#Load the saved model\n",
    "\t\tprint(\"Loading the model...\")\n",
    "\t\tloaded_model = None\n",
    "\t\twith open('./models/'+clf,'rb') as f:\n",
    "\t\t\tloaded_model = pickle.load(f)\n",
    "\n",
    "\t\tprint(\"The model has been loaded...doing predictions now...\")\n",
    "\t\tpredictions = loaded_model.predict(test)\n",
    "\t\t\n",
    "\t\t\"\"\"Add the predictions as Series to a new pandas dataframe\n",
    "\t\t\t\t\t\t\t\tOR\n",
    "\t\t   Depending on the use-case, the entire test data appended with the new files\n",
    "\t\t\"\"\"\n",
    "\t\tprediction_series = list(pd.Series(predictions))\n",
    "\n",
    "\t\tfinal_predictions = pd.DataFrame(list(zip(loan_ids, prediction_series)))\n",
    "\t\t\n",
    "\t\t\"\"\"We can be as creative in sending the responses.\n",
    "\t\t   But we need to send the response codes as well.\n",
    "\t\t\"\"\"\n",
    "\t\tresponses = jsonify(predictions=final_predictions.to_json(orient=\"records\"))\n",
    "\t\tresponses.status_code = 200\n",
    "\n",
    "\t\treturn (responses)\n",
    "\n",
    "```\n",
    "\n",
    "Once done, run: `gunicorn --bind 0.0.0.0:8000 server:app`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's generate some prediction data and query the API running locally at \n",
    "\n",
    "`https:0.0.0.0:8000/predict`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Setting the headers to send and accept json responses\\n'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'Reading test batch\\n'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'Converting Pandas Dataframe to json\\n'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"Setting the headers to send and accept json responses\n",
    "\"\"\"\n",
    "header = {'Content-Type': 'application/json', \n",
    "          'Accept': 'application/json'}\n",
    "\n",
    "\"\"\"Reading test batch\n",
    "\"\"\"\n",
    "df = pd.read_csv('../data/test.csv', encoding=\"utf-8-sig\")\n",
    "df = df.head()\n",
    "\n",
    "\"\"\"Converting Pandas Dataframe to json\n",
    "\"\"\"\n",
    "data = df.to_json(orient='records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'[{\"Loan_ID\":\"LP001015\",\"Gender\":\"Male\",\"Married\":\"Yes\",\"Dependents\":\"0\",\"Education\":\"Graduate\",\"Self_Employed\":\"No\",\"ApplicantIncome\":5720,\"CoapplicantIncome\":0,\"LoanAmount\":110.0,\"Loan_Amount_Term\":360.0,\"Credit_History\":1.0,\"Property_Area\":\"Urban\"},{\"Loan_ID\":\"LP001022\",\"Gender\":\"Male\",\"Married\":\"Yes\",\"Dependents\":\"1\",\"Education\":\"Graduate\",\"Self_Employed\":\"No\",\"ApplicantIncome\":3076,\"CoapplicantIncome\":1500,\"LoanAmount\":126.0,\"Loan_Amount_Term\":360.0,\"Credit_History\":1.0,\"Property_Area\":\"Urban\"},{\"Loan_ID\":\"LP001031\",\"Gender\":\"Male\",\"Married\":\"Yes\",\"Dependents\":\"2\",\"Education\":\"Graduate\",\"Self_Employed\":\"No\",\"ApplicantIncome\":5000,\"CoapplicantIncome\":1800,\"LoanAmount\":208.0,\"Loan_Amount_Term\":360.0,\"Credit_History\":1.0,\"Property_Area\":\"Urban\"},{\"Loan_ID\":\"LP001035\",\"Gender\":\"Male\",\"Married\":\"Yes\",\"Dependents\":\"2\",\"Education\":\"Graduate\",\"Self_Employed\":\"No\",\"ApplicantIncome\":2340,\"CoapplicantIncome\":2546,\"LoanAmount\":100.0,\"Loan_Amount_Term\":360.0,\"Credit_History\":null,\"Property_Area\":\"Urban\"},{\"Loan_ID\":\"LP001051\",\"Gender\":\"Male\",\"Married\":\"No\",\"Dependents\":\"0\",\"Education\":\"Not Graduate\",\"Self_Employed\":\"No\",\"ApplicantIncome\":3276,\"CoapplicantIncome\":0,\"LoanAmount\":78.0,\"Loan_Amount_Term\":360.0,\"Credit_History\":1.0,\"Property_Area\":\"Urban\"}]'"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"POST <url>/predict\n",
    "\"\"\"\n",
    "resp = requests.post(\"http://0.0.0.0:8000/predict\", \\\n",
    "                    data = json.dumps(data),\\\n",
    "                    headers= header)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "resp.status_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'predictions': '[{\"0\":\"LP001015\",\"1\":1},{\"0\":\"LP001022\",\"1\":1},{\"0\":\"LP001031\",\"1\":1},{\"0\":\"LP001035\",\"1\":1},{\"0\":\"LP001051\",\"1\":1}]'}"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"The final response we get is as follows:\n",
    "\"\"\"\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}